In [8]:
import zipfile
from urllib.request import urlopen
import os
# Where the archive lives upstream, and where the local working copy goes.
source_url = 'ftp://ftp.nhtsa.gov/GES/GES12/GES12_Flatfile.zip'
zip_name = 'GES12_Flatfile.zip'
cwd = os.getcwd()
dir_path = os.path.join(cwd, 'GES2012')
zip_path = os.path.join(dir_path, zip_name)

# We'll make a directory for you to play around with,
# then when you're done playing you can just delete the directory.
# exist_ok replaces the separate exists-check, which could race with
# another process creating the directory in between.
os.makedirs(dir_path, exist_ok=True)

# Download the file from GES website if you haven't already
if not os.path.exists(zip_path):
    # Write to a temporary name and rename only after the transfer finished,
    # so an interrupted download never leaves a truncated archive at zip_path
    # that the exists-check above would accept on the next run.
    partial_path = zip_path + '.part'
    with urlopen(source_url) as response, open(partial_path, 'wb') as fh:
        # Copy in 1 MiB chunks rather than holding the whole archive in memory.
        for chunk in iter(lambda: response.read(1024 * 1024), b''):
            fh.write(chunk)
    os.replace(partial_path, zip_path)

# Extract all the files from that zipfile
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(dir_path)
In [10]:
# See what we just unzipped. os.listdir returns entries in arbitrary order,
# so sort them to keep the displayed listing stable between runs.
sorted(os.listdir(dir_path))
Out[10]:
In [11]:
import pandas as pd
import numpy as np
import sklearn
# Rebuild the path to the extracted PERSON.TXT file relative to the current
# working directory, then parse it as tab-separated text.
cwd = os.getcwd()
dir_path = os.path.join(cwd, 'GES2012')
input_file_path = os.path.join(dir_path, 'PERSON.TXT')
input_data = pd.read_csv(
    input_file_path,
    sep='\t',
)
In [12]:
# Alphabetical list of every column name in the loaded table.
sorted(input_data.columns.tolist())
Out[12]:
In [13]:
# How many rows carry each INJSEV_IM code.
input_data['INJSEV_IM'].value_counts()
Out[13]:
In [15]:
# Drop those odd cases
keep_rows = input_data.INJSEV_IM != 6
input_data = input_data[keep_rows]

# Count missing values for all columns at once, then report only the
# columns that have at least one, in their original column order.
nan_counts = input_data.isnull().sum()
for column_name, n_nans in nan_counts[nan_counts > 0].items():
    print(column_name, n_nans)
In [21]:
# Print the table shape before and after the filtering below for comparison.
print(input_data.shape)

# Keep only the rows that have a MAKE value.
has_make = input_data.MAKE.notnull()
data = input_data[has_make]

# Remove both injury-severity columns from the feature frame:
# INJ_SEV is set aside, INJSEV_IM is kept as the prediction target.
discarded = data.pop('INJ_SEV')
target = data.pop('INJSEV_IM')
print(data.shape)
In [22]:
# Binarise the target: 1.0 where INJSEV_IM equals 4, 0.0 otherwise.
# The codes are read back from input_data (restricted to the rows kept in
# `data`) instead of from `target` itself, so re-running this cell yields the
# same labels rather than comparing already-binarised 0/1 values against 4
# and silently zeroing every label.
target = (input_data.loc[data.index, 'INJSEV_IM'] == 4).astype('float')
In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
# sklearn.grid_search and sklearn.cross_validation were removed in
# scikit-learn 0.20; sklearn.model_selection provides both names. Importing
# train_test_split explicitly also stops relying on the cross_validation
# submodule having been loaded as a side effect of another import.
from sklearn.model_selection import GridSearchCV, train_test_split

# Train on half of the data while reserving the other half for
# model comparisons. The fixed random_state makes the split, and therefore
# every score computed from it, repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    data.values, target.values, train_size=0.5, random_state=0)

# Fit ordinary least squares on the 0/1 target and score the held-out half
# by how well the raw predictions rank the positives (ROC AUC).
linreg = LinearRegression()
linreg.fit(xtrain, ytrain)
lr_preds = linreg.predict(xtest)
lr_perf = roc_auc_score(ytest, lr_preds)
print('OLS: Area under the ROC curve = {}'.format(lr_perf))
In [27]:
from sklearn.linear_model import Ridge

# Candidate regularisation strengths: 10 values log-spaced from 1e-10 to 1e10.
ridge_alphas = np.logspace(-10, 10, 10)
ridge = GridSearchCV(Ridge(), {'alpha': ridge_alphas})
ridge.fit(xtrain, ytrain)

# Evaluate the fitted search object on the held-out half with ROC AUC.
ridge_preds = ridge.predict(xtest)
ridge_performance = roc_auc_score(ytest, ridge_preds)
print('Ridge: Area under the ROC curve = {}'.format(ridge_performance))
In [29]:
from sklearn.linear_model import Lasso
from sklearn.metrics import roc_auc_score
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV is
# provided by sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths: 5 values log-spaced from 1e-10 to 1e-8.
lasso = GridSearchCV(Lasso(),
                     {'alpha': np.logspace(-10, -8, 5)})
lasso.fit(xtrain, ytrain)

# Evaluate the fitted search object on the held-out half with ROC AUC.
lasso_preds = lasso.predict(xtest)
lasso_performance = roc_auc_score(ytest, lasso_preds)
print('Lasso: Area under the ROC curve = {}'.format(lasso_performance))
In [30]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV is
# provided by sklearn.model_selection. The import is kept so the name stays
# in scope for cells that use it without importing it themselves.
from sklearn.model_selection import GridSearchCV

# 500 boosting stages; the fixed random_state keeps the fitted model and its
# feature importances repeatable across runs.
gbm = GradientBoostingClassifier(n_estimators=500, random_state=0)
gbm.fit(xtrain, ytrain)

# Column 1 of predict_proba is the probability of the second class, which is
# label 1.0 for the 0.0/1.0 target used here.
gbm_preds = gbm.predict_proba(xtest)[:, 1]
gbm_performance = roc_auc_score(ytest, gbm_preds)
print('GBM: Area under the ROC curve = {}'.format(gbm_performance))
In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Search over maximum tree depths 3 through 9.
depth_grid = {'max_depth': np.arange(3, 10)}
tree = GridSearchCV(DecisionTreeClassifier(), depth_grid)
tree.fit(xtrain, ytrain)

# Use the second predict_proba column (probability of label 1.0) as the
# ranking score for ROC AUC on the held-out half.
tree_probabilities = tree.predict_proba(xtest)
tree_preds = tree_probabilities[:, 1]
tree_performance = roc_auc_score(ytest, tree_preds)
print('DecisionTree: Area under the ROC curve = {}'.format(tree_performance))
In [33]:
# Label each GBM feature importance with its column name and print the ten
# largest. Series.order was removed in pandas 0.20; sort_values is its
# replacement, and head(10) takes the first ten rows positionally.
importances = pd.Series(gbm.feature_importances_, index=data.columns)
print(importances.sort_values(ascending=False).head(10))
In [ ]: